In [1]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)

# given some content...
content = ["How to format my hard disk", " Hard disk format problems "]

X = vectorizer.fit_transform(content)
feature_names = vectorizer.get_feature_names()

print("Feature names: {}".format(feature_names))
print(X.toarray().transpose())


Feature names: ['disk', 'format', 'hard', 'how', 'my', 'problems', 'to']
[[1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 0]]

Description of above data: Each column gives us a boolean (1 or 0) value letting us know if each word appears in the sentence (from content). Sentence 1 (content[0]) contains all words but "problems".


In [9]:
posts = [
    "This is a toy post about machine learning. Actually, it contains not much interesting stuff.",
    "Imaging databases provide storage capabilities.",
    "Most imaging databases save images permanently.",
    "Imaging databases store data.",
    "Imaging databases store data. Imaging databases store data. Imaging databases store data.",
]

# Create a training set
vectorizer = CountVectorizer(min_df=1)
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print("#samples: {}, #features: {}".format(num_samples, num_features))
print(vectorizer.get_feature_names())

# create a new post
new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])

# a naive similarity measure (which uses the full ndarray of the new post)
import scipy as sp
def dist_raw(v1, v2):
    delta = v1 - v2
    # norm: Euclidean norm (shortest distance)
    return sp.linalg.norm(delta.toarray())

# Find distances among all posts
import sys

def find_distances(vectorizer, new_post, posts, dist_func=dist_raw):
    X_train = vectorizer.fit_transform(posts)
    new_post_vec = vectorizer.traansform([new_post])
    num_samples, num_features = X_train.shape
    
    print("----------------------------------------")
    print("#samples: {}, #features: {}".format(num_samples, num_features))
    print(vectorizer.get_feature_names())
    print("----------------------------------------")
    
    best_dist = sys.maxsize
    best_i = None

    for i in range(0, num_samples):
        post = posts[i]
        if post == new_post:
            continue
        post_vec = X_train.getrow(i)
        
        d = dist_func(post_vec, new_post_vec)

        print("- Post %i with dist=%.2f: %s" % (i, d, post))

        if d < best_dist:
            best_dist = d
            best_i = i

    print("Best post is %i with dist=%.2f" % (best_i, best_dist))

find_distances(new_post, posts, X_train)

# explore the vectors for posts 3 & 4 since they all contain the same words
print("\nVectors for what should be similar sentences:")
print(X_train.getrow(3).toarray())
print(X_train.getrow(4).toarray())


#samples: 5, #features: 25
['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'save', 'storage', 'store', 'stuff', 'this', 'toy']
----------------------------------------
#samples: 5, #features: 25
['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'save', 'storage', 'store', 'stuff', 'this', 'toy']
----------------------------------------
- Post 0 with dist=4.00: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
- Post 1 with dist=1.73: Imaging databases provide storage capabilities.
- Post 2 with dist=2.00: Most imaging databases save images permanently.
- Post 3 with dist=1.41: Imaging databases store data.
- Post 4 with dist=5.10: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 3 with dist=1.41

Vectors for what should be similar sentences:
[[0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]]
[[0 0 0 0 3 3 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0]]

In [10]:
# Normalize the vectors, and try again
def dist_norm(v1, v2):
    v1_normalized = v1 / sp.linalg.norm(v1.toarray())
    v2_normalized = v2 / sp.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta.toarray())

find_distances(new_post, posts, X_train, dist_func=dist_norm)


----------------------------------------
#samples: 5, #features: 25
['about', 'actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'is', 'it', 'learning', 'machine', 'most', 'much', 'not', 'permanently', 'post', 'provide', 'save', 'storage', 'store', 'stuff', 'this', 'toy']
----------------------------------------
- Post 0 with dist=1.41: This is a toy post about machine learning. Actually, it contains not much interesting stuff.
- Post 1 with dist=0.86: Imaging databases provide storage capabilities.
- Post 2 with dist=0.92: Most imaging databases save images permanently.
- Post 3 with dist=0.77: Imaging databases store data.
- Post 4 with dist=0.77: Imaging databases store data. Imaging databases store data. Imaging databases store data.
Best post is 3 with dist=0.77

In [11]:
# using stop words; i.e. removing "noise" / useless info
# use common english stop words (can also provide a list of specific words)
vectorizer = CountVectorizer(min_df=1, stop_words='english')
print("Some of our stop words: {}".format(", ".join(sorted(vectorizer.get_stop_words())[0:20])))

# construct a new training set
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print("#samples: {}, #features: {}".format(num_samples, num_features))
print(vectorizer.get_feature_names())


Some of our stop words: a, about, above, across, after, afterwards, again, against, all, almost, alone, along, already, also, although, always, am, among, amongst, amoungst
#samples: 5, #features: 18
['actually', 'capabilities', 'contains', 'data', 'databases', 'images', 'imaging', 'interesting', 'learning', 'machine', 'permanently', 'post', 'provide', 'save', 'storage', 'store', 'stuff', 'toy']

In [12]:
# Using NLTK for stemming (reducing words to their specific word stem)
import nltk.stem as ns
s = ns.SnowballStemmer('english')
print(s.stem("graphics"))
print(s.stem("imaging"))
print(s.stem("image"))
print(s.stem("imagination"))
print(s.stem("imagine"))


graphic
imag
imag
imagin
imagin

In [13]:
# stem our posts before verctorizing
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super().build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')
X = vectorizer.fit_transform(content)
X_train = vectorizer.fit_transform(posts)
print(vectorizer.get_feature_names())

find_distances(new_post, posts, X_train, dist_func=dist_norm)


['actual', 'capabl', 'contain', 'data', 'databas', 'imag', 'interest', 'learn', 'machin', 'perman', 'post', 'provid', 'save', 'storag', 'store', 'stuff', 'toy']
----------------------------------------
#samples: 5, #features: 17
['actual', 'capabl', 'contain', 'data', 'databas', 'imag', 'interest', 'learn', 'machin', 'perman', 'post', 'provid', 'save', 'storag', 'store', 'stuff', 'toy']
----------------------------------------
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-13-1ca3a2b5bb80> in <module>()
     13 print(vectorizer.get_feature_names())
     14 
---> 15 find_distances(new_post, posts, X_train, dist_func=dist_norm)

<ipython-input-9-3f813ea68ebb> in find_distances(new_post, posts, X_train, dist_func)
     43             continue
     44         post_vec = X_train.getrow(i)
---> 45         d = dist_func(post_vec, new_post_vec)
     46 
     47         print("- Post %i with dist=%.2f: %s" % (i, d, post))

<ipython-input-10-d5f89fa560f3> in dist_norm(v1, v2)
      3     v1_normalized = v1 / sp.linalg.norm(v1.toarray())
      4     v2_normalized = v2 / sp.linalg.norm(v2.toarray())
----> 5     delta = v1_normalized - v2_normalized
      6     return sp.linalg.norm(delta.toarray())
      7 

/Users/brad/python-stuff/ml/env/lib/python3.4/site-packages/scipy/sparse/compressed.py in __sub__(self, other)
    369         elif isspmatrix(other):
    370             if (other.shape != self.shape):
--> 371                 raise ValueError("inconsistent shapes")
    372 
    373             return self._binopt(other,'_minus_')

ValueError: inconsistent shapes